package cn.net.withub.dataCollector.common.utils;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that reduce HTML markup to plain text.
 *
 * <p>Three independent strategies are offered:
 * <ul>
 *   <li>{@link #delHTMLTag(String)} - regex based; drops script/style elements, comments and tags.</li>
 *   <li>{@link #stripHtml(String)} - regex based; keeps paragraph and line breaks as CRLF.</li>
 *   <li>{@link #getTextFromTHML(String)} - parses with Jsoup and collapses whitespace.</li>
 * </ul>
 *
 * <p>None of the regex based methods decode character entities such as {@code &nbsp;}.
 * All methods return an empty string for {@code null} input.
 *
 * <p>Created by yuanjie on 2018/5/25.
 */
public class HTMLSpirit {

    /** A complete {@code <script>} element, body included. */
    private static final Pattern SCRIPT_BLOCK =
            Pattern.compile("<script[^>]*?>[\\s\\S]*?<\\/script>", Pattern.CASE_INSENSITIVE);

    /** A complete {@code <style>} element, body included. */
    private static final Pattern STYLE_BLOCK =
            Pattern.compile("<style[^>]*?>[\\s\\S]*?<\\/style>", Pattern.CASE_INSENSITIVE);

    /** An HTML comment, including conditional comments emitted by Word. */
    private static final Pattern COMMENT_BLOCK = Pattern.compile("<!--[\\s\\S]*?-->");

    /** Any remaining tag with at least one character between the angle brackets. */
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]+>");

    /** An opening {@code <p>} tag, with or without attributes, possibly wrapped across lines. */
    private static final Pattern PARAGRAPH_OPEN =
            Pattern.compile("<p(?:\\s[^>]*)?>", Pattern.CASE_INSENSITIVE);

    /** A {@code <br>} tag in any of its spellings ({@code <br>}, {@code <br/>}, {@code <br class=x />}). */
    private static final Pattern LINE_BREAK =
            Pattern.compile("<br(?:\\s[^>]*)?/?>", Pattern.CASE_INSENSITIVE);

    /** Anything between {@code <} and the next {@code >}; {@code [^>]} also spans line terminators. */
    private static final Pattern ANY_MARKUP = Pattern.compile("<[^>]*>");

    /**
     * Removes script elements, style elements, comments and all remaining tags from the input.
     *
     * <p>Scripts and styles are removed before comments so that comment markers used inside
     * them (for example the legacy {@code <style><!-- ... --></style>} idiom) are discarded
     * together with their element. Comments are removed before generic tags because a comment
     * may contain {@code >}, which the generic tag pattern would otherwise treat as the end
     * of the tag and leak the remainder of the comment into the result.
     *
     * @param htmlStr HTML text; may be {@code null}
     * @return the remaining text with leading and trailing whitespace trimmed, or an empty
     *         string when {@code htmlStr} is {@code null}
     */
    public static String delHTMLTag(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        String text = SCRIPT_BLOCK.matcher(htmlStr).replaceAll("");
        text = STYLE_BLOCK.matcher(text).replaceAll("");
        text = COMMENT_BLOCK.matcher(text).replaceAll("");
        text = ANY_TAG.matcher(text).replaceAll("");
        return text.trim();
    }

    /**
     * Strips markup while turning paragraph starts and line breaks into {@code "\r\n"}.
     *
     * <p>Opening {@code <p>} tags and {@code <br>} tags are matched case-insensitively, with or
     * without attributes, and may span several lines. Every other {@code <...>} sequence is
     * removed. The content of script and style elements is not removed, and character
     * entities are not decoded.
     *
     * @param content HTML text; may be {@code null}
     * @return the text with markup removed, or an empty string when {@code content} is {@code null}
     */
    public static String stripHtml(String content) {
        if (content == null) {
            return "";
        }
        // Paragraph starts become line breaks.
        String text = PARAGRAPH_OPEN.matcher(content).replaceAll("\r\n");
        // <br> in any spelling becomes a line break.
        text = LINE_BREAK.matcher(text).replaceAll("\r\n");
        // Everything else between angle brackets is dropped.
        text = ANY_MARKUP.matcher(text).replaceAll("");
        return text;
    }

    /**
     * Parses the input with Jsoup and returns the document text with whitespace collapsed.
     *
     * <p>Every character for which {@link Character#isSpaceChar(char)} or
     * {@link Character#isWhitespace(char)} is true counts as whitespace; each run of such
     * characters is replaced by a single ASCII space and the result is trimmed.
     *
     * <p>The method name contains a historical typo ("THML"); it is kept so existing callers
     * continue to compile.
     *
     * @param htmlStr HTML text; may be {@code null}
     * @return the whitespace-normalised text, or an empty string when {@code htmlStr} is {@code null}
     */
    public static String getTextFromTHML(String htmlStr) {
        if (htmlStr == null) {
            return "";
        }
        Document doc = Jsoup.parse(htmlStr);
        String text = doc.text();

        // Single pass: emit one ' ' per run of whitespace characters.
        StringBuilder collapsed = new StringBuilder(text.length());
        boolean previousWasSpace = false;
        for (int i = 0; i < text.length(); i++) {
            char current = text.charAt(i);
            if (Character.isSpaceChar(current) || Character.isWhitespace(current)) {
                if (!previousWasSpace) {
                    collapsed.append(' ');
                }
                previousWasSpace = true;
            } else {
                collapsed.append(current);
                previousWasSpace = false;
            }
        }
        return collapsed.toString().trim();
    }

    /**
     * Manual smoke check: parses a small HTML snippet with Jsoup and prints its text.
     *
     * <p>An earlier revision also built a large Word-exported HTML literal here, but it was
     * overwritten before being read, so it has been removed; the printed output is unchanged.
     *
     * @param args ignored
     * @throws Exception declared for signature compatibility; nothing here throws a checked exception
     */
    public static void main(String[] args) throws Exception {
        String html = "<html><title>张三</title><body><div>a啊实打实多撒撒大声地撒多所</div></body></html>";

        String text = Jsoup.parse(html).text();

        System.out.print("s； " + text);
    }
}
